# Directory globals

BASEDIR <- "/data3/buildingcodes-jpe-data/"
INPUT_PUBLIC <- file.path(BASEDIR, "input/public/") 
INPUT_PRIVATE <- file.path(BASEDIR, "input/private/")
ZTRAX <- file.path(INPUT_PRIVATE, "ZTRAX")

WORKING <- file.path(BASEDIR, "working-private")
RES <- "results" # Will also contain public output, e.g., the cleaned up destroyed homes data

pacman::p_load(tidyverse)

set.seed(12345) # Set a seed to insure against any probabilistic factors leading to different results

options(stringsAsFactors = FALSE) ## Do not load strings as factors
options(scipen = 999) # Do not print scientific notation

## Enforce that 'select' and 'filter' call the dplyr command (as opposed to those from the raster package)
select <- dplyr::select
filter <- dplyr::filter

## Formatting functions for summary statistics 
# Override N fxn to include commas
# Ref:
N <- function(x) format(length(x), big.mark = ",")
# https://github.com/vincentarelbundock/modelsummary/issues/656

P5 = function (x, fmt = NULL, na.rm = TRUE, ...) {
  out <- stats::quantile(x, prob = 0.05, na.rm = na.rm)
  return(out)
}


P95 = function (x, fmt = NULL, na.rm = TRUE, ...) {
  out <- stats::quantile(x, prob = 0.95, na.rm = na.rm)
  return(out)
}




globalplotoptions <- theme_classic() + theme(
  axis.text = element_text(size = 14),
  axis.title.x = element_text(size = 14, margin = margin(10, 0, 0, 0)),
  axis.title.y = element_text(size = 14, margin = margin(0, 10, 0, 0)),
  axis.line = element_line(linewidth = 1),
  # strip.background=element_blank(),
  strip.text = element_text(size = 14),
  plot.title = element_text(size = 16, hjust = 0.5)
)

## Post-processing function for etable to get rid of \begingroup, \par\endgroup, and \centering in tables.
trimtable <- function(x) {
  x[grepl("\\\\(begin|end)group", x)] <- ""
  x <- gsub("\\centering", "", x, fixed = T)
  return(x)
}

### Set etable defaults ----
#- For AER, turn off stars with signif.code=NULL. To change back to normal stars, use signif.code=("***"=0.01, "**"=0.05, "*"=0.10).

require(fixest)

# Set up a default data dictionary
dict_names <- c(
  "destroyed" = "Destroyed",
  "destroyed_num" = "Destroyed",
  "treatmentgroup::No-codes" = "No-codes",
  "treatmentgroup::SRA" = "SRA",
  "treatmentgroup::LRA-VHFHSZ" = "LRA-VHFHSZ",
  "period" = "Vintage",
  "period::0" = "Before 1998",
  "period::1" = "1998--2007",
  "period::2" = "2008--2016",
  "slope" = "Ground slope (degrees)",
  "elev" = "Elevation (meters)",
  "wildfire_hazard" = "Wildfire hazard",
  "wildfire_hazard_pct" = "Wildfire hazard (\\%)",
  "fuelmodel" = "Fuel model indicators",
  "LotSizeAcres" = "Lot size (acres)",
  "logLotSize" = "log(Lot size)",
  "sqfeet1000" = "Square feet (thousands)",
  "TotalBedrooms" = "Bedrooms",
  "log_assessed_value" = "log(Assessed value)",
  "incidentid^treatmentgroup" = "Incident",
  "incidentid" = "Incident",
  "streetXfire^treatmentgroup" = "Street",
  "streetXfire" = "Street",
  "streetXfire^treatmentgroup_split" = "Street",
  "streetXfireXside^treatmentgroup" = "Street $\\times$ Side",
  "street25Xfire" = "Sub-street",
  "street25Xfire^treatmentgroup" = "Sub-street",
  "street25XfireXside^treatmentgroup" = "Sub-street $\\times$ Side",
  "streetXfire^block^treatmentgroup" = "Street $\\times$ Block",
  "treatmentgroup_split::NoCodes(CA)" = "No-codes (CA)",
  "treatmentgroup_split::NoCodes(non-CA)" = "No-codes (non-CA)",
  "treatmentgroup_split::SRA" = "SRA",
  "treatmentgroup_split::LRA-VHFHSZ" = "LRA-VHFHSZ",
  "fouryearbin" = "Year built bins",
  "precode" = "Pre-code Neighbors",
  "postcode" = "Post-code Neighbors",
  "precode1TRUE" = "1 Pre-code Neighbor",
  "postcode1TRUE" = "1 Post-code Neighbor",
  "precode2TRUE" = "2+ Pre-code Neighbors",
  "postcode2TRUE" = "2+ Post-code Neighbors",
  "n_Dwalls0_pre" = "Pre-code Neighbors",
  "n_Dwalls0_post" = "Post-code Neighbors",
  "combinedyear" = "Year built",
  "sales_price_1000s" = "Sold price (thousands)",
  "neighbors_in_30m_centroid" = "Neighbors in 30m",
  "persqftlosses" = "Losses per sf",
  "tract" = "Census tract",
  "street^treatmentgroup" = "Street",
  "ImportParcelID" = "Parcel",
  "logprice" = "Log sale price",
  "sale_period" = "Sold period",
  "sale_period::0" = "Sold before 1998",
  "sale_period::1" = "Sold 1998--2007",
  "sale_period::2" = "Sold 2008--2016",
  "growth_rate_cap" = "Home Growth Rate After Building Codes",
  "avg_slope" = "Ground slope (degrees)",
  "avg_elev" = "Elevation (meters)",
  "avg_hazard" = "Wildfire hazard",
  "avg_lot_size" = "Lot size (acres)",
  "avg_sqft" = "Square feet (thousands)", 
  "avg_bedrooms" = "Bedrooms"
)

# Set fixest defaults
setFixest_dict(dict_names, reset = T)

setFixest_etable(
  style.tex = style.tex("aer",
                        signif.code = c("***" = 0.01, "**" = 0.05, "*" = 0.10),
                        fixef.title = "\\midrule",
                        fixef.suffix = " FE",
                        fixef.where = "var",
                        yesNo = "$\\checkmark$",
                        interaction.combine = " $\\times$ "
  ),
  postprocess.tex = trimtable,
  depvar = F,
  order = c("SRA", "LRA", "^1998", "^2008", "slope", "Fuel"),
  group = list(
    "\\\\ Fuel model indicators" = c("Fuel"),
    "Additional controls" = c("Sq. Feet", "Bedrooms", "Lot Size", "Elevation", "Wildfire hazard")
  ),
  fitstat = c("n", "r2", "my"),
  digits = "s3",
  digits.stats = "s2",
  powerBelow = -10
)
# For some reason, the group argument doesn't "take" with setFixest_etable, so we'll have to manually add it in later.

assign_wildfire_hazard <- function(pts) {
  # Save original CRS, to be restored after processing 
  orig_CRS <- st_crs(pts)
  
  # Fire risk rasters ----
  RISK <- file.path(INPUT_PUBLIC, "RiskToStructures","unzipped")
  bp <- raster(file.path(RISK, "BP_All.tif")) # Burn probability
  flep4 <- raster(file.path(RISK, "FLEP4_All.tif")) # Flame length exceedance probability-4 feet
  
  pts$bp <- raster::extract(bp, pts)
  pts$flep <- raster::extract(flep4, pts)
  pts$wildfire_hazard = pts$bp * pts$flep # This is our computed measure of wildfire hazard: the probability of a wildfire with flame length > 4 feet (i.e., a significant fire)
  
  # Restore original CRS
  pts <- st_transform(pts, orig_CRS)
  
  # Return the points
  pts
}


# Assign spatial data to a set of points
assign_spatial <- function(pts) {
  # pts = homes
  # END DEBUG
  
  # Save original CRS, to be restored after processing 
  orig_CRS <- st_crs(pts)
  
  # LANDFIRE rasters ----
  LF <- file.path(INPUT_PUBLIC, "LANDFIRE")
  slope_rast <- raster(file.path(LF, "Slope/Grid/us_slp"))
  elev_rast <- raster(file.path(LF, "DEM_Elevation/Grid/us_dem"))
  aspect_rast <- raster(file.path(LF, "Aspect/Grid/us_asp"))
  fuelmodel_rast <- raster(file.path(LF, "US_140FBFM13_12052016/Grid/us_140fbfm13"))
  
  # Compute raster values for all points
  pts$elev <- raster::extract(elev_rast, pts)
  pts$slope <- raster::extract(slope_rast, pts)
  pts$aspect <- recode_aspect(raster::extract(aspect_rast, pts))
  pts$fuelmodel <- recode_fuelmodel(raster::extract(fuelmodel_rast, pts))
  
  # S/SW aspects are likely higher risk
  pts$s_sw <- pts$aspect %in% c("S", "SW")
  
  block_files <- list.files(file.path(INPUT_PUBLIC, "CensusTiger"), "block", full.names = T)
  blocks_list <- lapply(block_files, st_read) %>% bind_rows()
  
  blocks <- blocks_list %>% select(block = GEOID10)
  
  # Create a temporary id
  pts$tempid <- 1:nrow(pts)
  
  # For a very very small number of homes that are exactly on the border, pick block at random
  pts <- pts %>% 
    st_transform(st_crs(blocks)) %>%
    st_join(blocks)
  
  pts <- pts %>% distinct(tempid, .keep_all = T) %>% select(-tempid)
  
  # Compute tract and block group variables
  pts = pts %>% 
    mutate(tract = substr(block, 1, 11),
           blockgroup = substr(block, 1, 12))
  
  # Restore original CRS
  pts <- st_transform(pts, orig_CRS)
  
  # Return the points
  pts
}

# Function to recode aspect
recode_aspect <- function(aspect) {
  dir_labs <- c(
    "flat", "N", "NE", "E", "SE",
    "S", "SW", "W", "NW", "N"
  )
  cut(aspect,
      breaks = c(-1, 0, seq(45 / 2, 360 - 45 / 2, 45), 360),
      labels = dir_labs,
      right = F, include.lowest = T
  )
}

recode_aspect(c(-0.5, -1, 0, 15, 30, 45, 90, 270, 360))


recode_fuelmodel <- function(x) {
  case_match(
    x,
    1 ~ "Grass1",
    2 ~ "Grass2",
    3 ~ "Grass3",
    4 ~ "Shrub1",
    5 ~ "Shrub2",
    6 ~ "Shrub3",
    7 ~ "Shrub4",
    8 ~ "Timber1",
    9 ~ "Timber2",
    10 ~ "Timber3",
    11 ~ "Slash1",
    12 ~ "Slash2",
    13 ~ "Slash3",
    91:99 ~ "Unburnable"
  )
}

recode_fuelmodel(c(1, 4, 99, NA_integer_))





Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}



get_SRA_FHSZ_status <- function(pts) {
  # pts <- homepoints # DEBUG
  
  # Assign using a selection of the SRA maps, basically the last one from each year they were available
  pts$SRA_1990 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/sra90_1.gdb"))
  pts$SRA_1996 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/sra96_3.gdb"))
  pts$SRA_2003 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/sra03_3.gdb"))
  pts$SRA_2005 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/sra05_8.gdb"))
  pts$SRA_2010 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/sra10_2.gdb"))
  pts$SRA_2011 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/sra11_2.gdb"))
  pts$SRA_2012 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/SRA12_2.gdb"))
  pts$SRA_2013 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/SRA13_2.gdb"))
  pts$SRA_2014 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/SRA14_2.gdb"))
  pts$SRA_2015 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/SRA15_2.gdb"))
  pts$SRA_2016 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/SRA16_2.gdb"))
  pts$SRA_2017 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/SRA17_2.gdb"))
  pts$SRA_2018 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/SRA18_2.gdb"))
  pts$SRA_2019 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/SRA19_1.gdb"))
  pts$SRA_2020 <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalSRA/SRA_archive/SRA20_1.gdb"))
  pts$SRAcurrent <- get_SRA(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/State_Responsibility_Area-shp/Responsibility_Areas.shp"))
  
  # Assign SRA status based on combinedyear (year/effective year of construction)
  pts <- pts %>%
    mutate(SRA = case_when(
      !(State %in% c("CA", "California")) ~ "NotCA",
      combinedyear <= 1995 ~ SRA_1990,
      combinedyear %in% 1996:2002 ~ SRA_1996,
      combinedyear %in% 2003:2004 ~ SRA_2003,
      combinedyear %in% 2005:2009 ~ SRA_2005,
      combinedyear %in% 2010:2010 ~ SRA_2010,
      combinedyear %in% 2011:2011 ~ SRA_2011,
      combinedyear %in% 2012:2012 ~ SRA_2012,
      combinedyear %in% 2013:2013 ~ SRA_2013,
      combinedyear %in% 2014:2014 ~ SRA_2014,
      combinedyear %in% 2015:2015 ~ SRA_2015,
      combinedyear %in% 2016:2016 ~ SRA_2016,
      combinedyear %in% 2017:2017 ~ SRA_2017,
      combinedyear %in% 2018:2018 ~ SRA_2018,
      combinedyear %in% 2019:2019 ~ SRA_2019,
      combinedyear >= 2020 ~ SRA_2020,
      is.na(combinedyear) & !is.na(SRAcurrent) ~ SRAcurrent)) # If year built is missing, use current SRA status
  # Sometimes these will be NA if a home's location is in a body of water
  
  # Add FHSZ indicators -----
  pts$hazclass_sra_85 <- get_FHSZ(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalVHFSZ/Historic_FHSZ/FHSZ_historic/fhsz85_4.gdb")) 
  pts <- pts %>%
    mutate(hazclass_sra_85 = case_when(
      hazclass_sra_85 == "VERY_HIGH" ~ "Very High",
      hazclass_sra_85 == "HIGH" ~ "High",
      hazclass_sra_85 == "MODERATE" ~ "Moderate",
      hazclass_sra_85 == "NODATA" ~ "No Data",
      hazclass_sra_85 == "NONSRA" | is.na(hazclass_sra_85) ~ "Not in SRA"
    ))
  
  pts$hazclass_sra_07 <- get_FHSZ(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/SRA_FHSZ/California_Fire_Hazard_Severity_Zones__FHSZ_.shp"))
  pts <- pts %>% mutate(hazclass_sra_07 = replace_na(hazclass_sra_07, "Not in SRA"))
  
  # History of FHSZs
  # https://osfm.fire.ca.gov/what-we-do/community-wildfire-preparedness-and-mitigation/fire-hazard-severity-zones
  # FIRST SRA MAP: 1985 (after 1980 panorama)
  # FIRST LRA MAP FOLLOWED BATES BILL: 1992? But we don't have that map. 
  # 2007: FRAP FINALIZED MODEL TO USE
  # 2011: LRA MAPPED 38 COUNTIES AND 200 cities
  # Note: We have pretty good reason to think that the 98 LRA map is not very good. We found a PDF of Paradise, CA
  
  pts$LRAFHSZ_1998 <- get_FHSZ(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/HistoricalVHFSZ/Historic_FHSZ/FHSZ_historic/bates98_3.gdb")) 
  pts <- pts %>%
    mutate(hazclass_lra_98 = case_when(
      LRAFHSZ_1998 %in% c("VH-10", "VH-11", "VH-14", "VH-?") ~ "Very High", # Only "Very High" ratings in LRA
      TRUE ~ "Unrated"))
  
  pts$LRAFHSZ_2008 <- get_FHSZ(pts, file.path(INPUT_PUBLIC, "SRAandFHSZ/LRA_FHSZ/California_Fire_Hazard_Severity_Zones__FHSZ_.shp"))
  pts <- pts %>%
    mutate(hazclass_lra_08 = case_when(
      LRAFHSZ_2008 %in% c("VERY HIGH", "Very High") ~ "Very High", # Only "Very High" ratings in LRA
      TRUE ~ "Unrated"))
  
  # Assign VHFHSZ status
  pts <- pts %>%
    mutate(vh_sra_85 = hazclass_sra_85 == "Very High",
           vh_sra_07 = hazclass_sra_07 == "Very High",
           vh_lra_98 = hazclass_lra_98 == "Very High",
           vh_lra_08 = hazclass_lra_08 == "Very High")
  
  # Note: unlike SRA, we do not use FHSZ status when built. This is because adopting of the FHSZ maps was uneven: many municipalities did not adopt these exact maps, particularly the 1998 version. For example, Paradise, CA was not included in the 1998 map but the town of Paradise did require building to code throughout. When the Camp Fire occured, many of those homes built after 1998 survived. So we take these maps as a rough reference for where the applied, and designate a home as in the LRA-VHFHSZ if they are located within either the 1998 or the 2008 maps.
  # There are only a small number of homes in this sample that actually change status from VH to not-VH between the 1998 and 2008 maps. The vast majority of these are in the Tubbs fire. We later include a robustness check that drops these "lraYN" homes from the sample - it doesn't affect the results.
  
  pts
}

# Function to add a given SRA-year indicator
get_SRA <- function(pts, sra_file) {
  # pts <- pts # sf object of points for which we want to assign SRA
  # sra_file <- file.path(INPUT_PUBLIC,"SRAandFHSZ/State_Responsibility_Area-shp/Responsibility_Areas.shp")
  # END DEBUG
  print(sra_file)
  
  sra_sf = st_read(sra_file) %>%
    st_transform(crs = st_crs(pts)) %>%
    st_cast("MULTIPOLYGON")
  
  # Create a temporary id
  pts$tempid <- 1:nrow(pts)
  
  # Spatial join
  pts <- pts %>% select(geometry, tempid) %>% st_join(sra_sf, left = T)
  
  # For a very very small number of homes that are exactly on the border, pick at random
  pts <- pts %>% distinct(tempid, .keep_all = T) %>% select(-tempid)
  
  # Replace NON with LRA
  pts$SRA[pts$SRA == "NON"] <- "LRA"
  
  pts$SRA
}


# Return a vector of VHFZ indicators
get_FHSZ <- function(pts, fhsz_file) {
  # pts <- homes
  # fhsz_file <- file.path(INPUT_PUBLIC, "SRAandFHSZ/LRA_FHSZ/California_Fire_Hazard_Severity_Zones__FHSZ_.shp")
  # END DEBUG
  print(fhsz_file)
  
  fhsz_sf = st_read(fhsz_file) %>%
    st_transform(crs = st_crs(pts)) %>%
    st_cast("MULTIPOLYGON")
  
  # Create a temporary id
  pts$tempid <- 1:nrow(pts)
  
  # Spatial join
  pts <- pts %>% select(geometry, tempid) %>% st_join(fhsz_sf, left = T)
  
  # For a very very small number of homes that are exactly on the border, pick at random
  pts <- pts %>% distinct(tempid, .keep_all = T) %>% select(-tempid)
  
  # Return relevant variable for HAZ_CLASS (different for SRA / LRA VHFZ files)
  if ("SEVERITY" %in% names(fhsz_sf)) { # for SRA files
    return(pts$SEVERITY)
  } else if ("RATING" %in% names(fhsz_sf)) { # for 1998 LRA
    return(pts$RATING)
  } else if ("HAZ_CLASS" %in% names(fhsz_sf)) { # for 2008 LRA
    return(pts$HAZ_CLASS)
  }
}

get_SMD <- function(x, cat, ref) {
  # Compute standardized mean differences (SMD) in x following Imbens and Rubin (2015) -- who call them "normalized differences".
  # x: numeric vector of variable to compute SMDs for
  # cat: vector indicating treatment assignment
  # ref: reference category for cat (e.g., "Control" or 0)
  
  # x <- rnorm(100)
  # cat <- sample(c("Control", "Treatment"), 100, replace = TRUE)
  # ref <- "Control"
  
  # Get mean and sd for reference category
  x_ref <- mean(x[cat == ref], na.rm = T)
  sd_ref <- sd(x[cat == ref], na.rm = T)
  
  # Get a list of remaining non-reference categories in cat
  cat_others <- unique(cat[cat != ref])
  
  # Compute normalized differences for each non-reference category
  ndiffs <- numeric(length(cat_others))
  for (i in 1:length(cat_others)) {
    x_other <- mean(x[cat == cat_others[i]], na.rm = T)
    sd_other <- sd(x[cat == cat_others[i]], na.rm = T)
    ndiffs[i] <- (x_other - x_ref) / sqrt((sd_ref^2 + sd_other^2) / 2)
    
    # print(x_ref)
    # print(x_other)
    # print(sd_ref)
    # print(sd_other)
  }
  
  
  names(ndiffs) <- as.character(cat_others)
  return(ndiffs)
}

# Example usage:
get_SMD(x = rnorm(100), 
        cat = sample(c("Control", "Treatment"), 100, replace = TRUE), 
        ref = "Control")

get_SMDs <- function(x_vars, data, cat, ref) {
  # Compute standardized mean differences (SMD) for a list of variables in a data frame
  # x_vars: vector of variable names to compute SMDs for
  # data: data frame containing variables
  # cat: vector indicating treatment assignment
  # ref: reference category for cat (e.g., "Control" or 0)
  
  # x_vars <- c("x1", "x2", "x3")
  # data <- data.frame(x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100))
  # cat <- sample(c("Control", "Treatment"), 100, replace = TRUE)
  # ref <- "Control"
  
  # Get SMDs for each variable
  smds <- lapply(x_vars, function(x) get_SMD(data[[x]], cat, ref))
  
  names(smds) <- x_vars
  return(smds)
}

# Example usage
# get_SMDs(x_vars = c("x1", "x2", "x3"), 
#          data = data.frame(x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100)), 
#          cat = cat <- sample(c("Control", "Treatment"), 100, replace = TRUE),
#          ref = "Control")
